# Load required packages up front.
# FIX: the rest of this script calls corrplot(), rpart(), rpart.plot(),
# ggplot(), randomForest(), xgboost() and confusionMatrix() but never
# attached their packages anywhere.
library(corrplot)      # correlation matrix plots
library(rpart)         # decision trees
library(rpart.plot)    # decision tree visualization
library(ggplot2)       # grammar-of-graphics plots
library(randomForest)  # random forest classifier
library(xgboost)       # gradient boosting
library(caret)         # confusionMatrix()

# Load the built-in iris data set
data(iris)

# Frequency table over is.na(): FALSE = observed value, TRUE = missing
table(is.na(iris))
##
## FALSE
## 750

# Missing-value count per column (all zero: iris is complete)
colSums(is.na(iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0

# Five-number summary of Sepal.Length before drawing its histogram;
# the 4.3-7.9 range is reused as the histogram's xlim below.
summary(iris$Sepal.Length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.300 5.100 5.800 5.843 6.400 7.900
# Histogram of sepal length. Arguments: xlab = x-axis label,
# col = bar color, main = title, xlim = x-axis range (4.3-7.9,
# the observed min/max from summary() above).
hist(iris$Sepal.Length,
     main = "iris 꽃 받침 길이 Histogram",
     xlab = "iris$Sepal.Length",
     xlim = c(4.3, 7.9),
     col = "magenta")

# Same treatment for sepal width; its summary shows a 2.0-4.4 spread.
summary(iris$Sepal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.800 3.000 3.057 3.300 4.400
hist(iris$Sepal.Width,
     main = "iris 꽃받침 너비 Histogram",
     xlab = "iris$Sepal.Width",
     xlim = c(2.0, 4.5),
     col = "mistyrose")

# Which species has the widest sepals?
# Formula notation y ~ x reads "y as a function of x",
# so this shows sepal width broken down by species.
boxplot(Sepal.Width ~ Species, data = iris)

# Mean petal length per species
aggregate(Petal.Length ~ Species, FUN = mean, data = iris)
## Species Petal.Length
## 1 setosa 1.462
## 2 versicolor 4.260
## 3 virginica 5.552
# Correlation analysis: quantify how related each pair of measurement
# variables is — which variables move together?
# Start with the setosa rows only (measurement columns 1:4).
test_s <- iris[iris$Species == "setosa", 1:4]
test_s
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
## 7 4.6 3.4 1.4 0.3
## 8 5.0 3.4 1.5 0.2
## 9 4.4 2.9 1.4 0.2
## 10 4.9 3.1 1.5 0.1
## 11 5.4 3.7 1.5 0.2
## 12 4.8 3.4 1.6 0.2
## 13 4.8 3.0 1.4 0.1
## 14 4.3 3.0 1.1 0.1
## 15 5.8 4.0 1.2 0.2
## 16 5.7 4.4 1.5 0.4
## 17 5.4 3.9 1.3 0.4
## 18 5.1 3.5 1.4 0.3
## 19 5.7 3.8 1.7 0.3
## 20 5.1 3.8 1.5 0.3
## 21 5.4 3.4 1.7 0.2
## 22 5.1 3.7 1.5 0.4
## 23 4.6 3.6 1.0 0.2
## 24 5.1 3.3 1.7 0.5
## 25 4.8 3.4 1.9 0.2
## 26 5.0 3.0 1.6 0.2
## 27 5.0 3.4 1.6 0.4
## 28 5.2 3.5 1.5 0.2
## 29 5.2 3.4 1.4 0.2
## 30 4.7 3.2 1.6 0.2
## 31 4.8 3.1 1.6 0.2
## 32 5.4 3.4 1.5 0.4
## 33 5.2 4.1 1.5 0.1
## 34 5.5 4.2 1.4 0.2
## 35 4.9 3.1 1.5 0.2
## 36 5.0 3.2 1.2 0.2
## 37 5.5 3.5 1.3 0.2
## 38 4.9 3.6 1.4 0.1
## 39 4.4 3.0 1.3 0.2
## 40 5.1 3.4 1.5 0.2
## 41 5.0 3.5 1.3 0.3
## 42 4.5 2.3 1.3 0.3
## 43 4.4 3.2 1.3 0.2
## 44 5.0 3.5 1.6 0.6
## 45 5.1 3.8 1.9 0.4
## 46 4.8 3.0 1.4 0.3
## 47 5.1 3.8 1.6 0.2
## 48 4.6 3.2 1.4 0.2
## 49 5.3 3.7 1.5 0.2
## 50 5.0 3.3 1.4 0.2
cor(test_s)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 0.7425467 0.2671758 0.2780984
## Sepal.Width 0.7425467 1.0000000 0.1777000 0.2327520
## Petal.Length 0.2671758 0.1777000 1.0000000 0.3316300
## Petal.Width 0.2780984 0.2327520 0.3316300 1.0000000
plot(test_s)          # scatter plot matrix of the four measurements

corrplot(cor(test_s)) # heat-map view of the correlation matrix (corrplot pkg)

# Setosa result: sepal length and sepal width are the related pair.
# Repeat the correlation analysis for versicolor.
test_ver <- iris[iris$Species == "versicolor", 1:4]
test_ver
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 51 7.0 3.2 4.7 1.4
## 52 6.4 3.2 4.5 1.5
## 53 6.9 3.1 4.9 1.5
## 54 5.5 2.3 4.0 1.3
## 55 6.5 2.8 4.6 1.5
## 56 5.7 2.8 4.5 1.3
## 57 6.3 3.3 4.7 1.6
## 58 4.9 2.4 3.3 1.0
## 59 6.6 2.9 4.6 1.3
## 60 5.2 2.7 3.9 1.4
## 61 5.0 2.0 3.5 1.0
## 62 5.9 3.0 4.2 1.5
## 63 6.0 2.2 4.0 1.0
## 64 6.1 2.9 4.7 1.4
## 65 5.6 2.9 3.6 1.3
## 66 6.7 3.1 4.4 1.4
## 67 5.6 3.0 4.5 1.5
## 68 5.8 2.7 4.1 1.0
## 69 6.2 2.2 4.5 1.5
## 70 5.6 2.5 3.9 1.1
## 71 5.9 3.2 4.8 1.8
## 72 6.1 2.8 4.0 1.3
## 73 6.3 2.5 4.9 1.5
## 74 6.1 2.8 4.7 1.2
## 75 6.4 2.9 4.3 1.3
## 76 6.6 3.0 4.4 1.4
## 77 6.8 2.8 4.8 1.4
## 78 6.7 3.0 5.0 1.7
## 79 6.0 2.9 4.5 1.5
## 80 5.7 2.6 3.5 1.0
## 81 5.5 2.4 3.8 1.1
## 82 5.5 2.4 3.7 1.0
## 83 5.8 2.7 3.9 1.2
## 84 6.0 2.7 5.1 1.6
## 85 5.4 3.0 4.5 1.5
## 86 6.0 3.4 4.5 1.6
## 87 6.7 3.1 4.7 1.5
## 88 6.3 2.3 4.4 1.3
## 89 5.6 3.0 4.1 1.3
## 90 5.5 2.5 4.0 1.3
## 91 5.5 2.6 4.4 1.2
## 92 6.1 3.0 4.6 1.4
## 93 5.8 2.6 4.0 1.2
## 94 5.0 2.3 3.3 1.0
## 95 5.6 2.7 4.2 1.3
## 96 5.7 3.0 4.2 1.2
## 97 5.7 2.9 4.2 1.3
## 98 6.2 2.9 4.3 1.3
## 99 5.1 2.5 3.0 1.1
## 100 5.7 2.8 4.1 1.3
cor(test_ver)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 0.5259107 0.7540490 0.5464611
## Sepal.Width 0.5259107 1.0000000 0.5605221 0.6639987
## Petal.Length 0.7540490 0.5605221 1.0000000 0.7866681
## Petal.Width 0.5464611 0.6639987 0.7866681 1.0000000
plot(test_ver)            # scatter plot matrix

corrplot(cor(test_ver))   # correlation matrix heat map

# Versicolor result: sepal length ~ petal length, and
# petal length ~ petal width, are the strong pairs. Now virginica.
test_vi <- iris[iris$Species == "virginica", 1:4]
test_vi
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 101 6.3 3.3 6.0 2.5
## 102 5.8 2.7 5.1 1.9
## 103 7.1 3.0 5.9 2.1
## 104 6.3 2.9 5.6 1.8
## 105 6.5 3.0 5.8 2.2
## 106 7.6 3.0 6.6 2.1
## 107 4.9 2.5 4.5 1.7
## 108 7.3 2.9 6.3 1.8
## 109 6.7 2.5 5.8 1.8
## 110 7.2 3.6 6.1 2.5
## 111 6.5 3.2 5.1 2.0
## 112 6.4 2.7 5.3 1.9
## 113 6.8 3.0 5.5 2.1
## 114 5.7 2.5 5.0 2.0
## 115 5.8 2.8 5.1 2.4
## 116 6.4 3.2 5.3 2.3
## 117 6.5 3.0 5.5 1.8
## 118 7.7 3.8 6.7 2.2
## 119 7.7 2.6 6.9 2.3
## 120 6.0 2.2 5.0 1.5
## 121 6.9 3.2 5.7 2.3
## 122 5.6 2.8 4.9 2.0
## 123 7.7 2.8 6.7 2.0
## 124 6.3 2.7 4.9 1.8
## 125 6.7 3.3 5.7 2.1
## 126 7.2 3.2 6.0 1.8
## 127 6.2 2.8 4.8 1.8
## 128 6.1 3.0 4.9 1.8
## 129 6.4 2.8 5.6 2.1
## 130 7.2 3.0 5.8 1.6
## 131 7.4 2.8 6.1 1.9
## 132 7.9 3.8 6.4 2.0
## 133 6.4 2.8 5.6 2.2
## 134 6.3 2.8 5.1 1.5
## 135 6.1 2.6 5.6 1.4
## 136 7.7 3.0 6.1 2.3
## 137 6.3 3.4 5.6 2.4
## 138 6.4 3.1 5.5 1.8
## 139 6.0 3.0 4.8 1.8
## 140 6.9 3.1 5.4 2.1
## 141 6.7 3.1 5.6 2.4
## 142 6.9 3.1 5.1 2.3
## 143 5.8 2.7 5.1 1.9
## 144 6.8 3.2 5.9 2.3
## 145 6.7 3.3 5.7 2.5
## 146 6.7 3.0 5.2 2.3
## 147 6.3 2.5 5.0 1.9
## 148 6.5 3.0 5.2 2.0
## 149 6.2 3.4 5.4 2.3
## 150 5.9 3.0 5.1 1.8
cor(test_vi)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 0.4572278 0.8642247 0.2811077
## Sepal.Width 0.4572278 1.0000000 0.4010446 0.5377280
## Petal.Length 0.8642247 0.4010446 1.0000000 0.3221082
## Petal.Width 0.2811077 0.5377280 0.3221082 1.0000000
plot(test_vi)            # scatter plot matrix

corrplot(cor(test_vi))   # correlation matrix heat map

# Virginica result: sepal length and petal length are highly
# correlated (r = 0.86).
# What does the correlation structure look like across all 150 rows?
cor(iris[, -5])   # drop column 5 (Species); same as iris[, 1:4]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
corrplot(cor(iris[, -5]))

# Whew..
# Regression analysis: model the relationship between two variables.
# linear model = lm. Within virginica, predict sepal length from petal
# length — the most strongly correlated pair above (r = 0.86).
testvi_lm <- lm(Sepal.Length ~ Petal.Length, data = test_vi)
testvi_lm
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = test_vi)
##
## Coefficients:
## (Intercept) Petal.Length
## 1.0597 0.9957
summary(testvi_lm) # inspect the fitted model
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = test_vi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73409 -0.23643 -0.03132 0.23771 0.76207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.05966 0.46677 2.27 0.0277 *
## Petal.Length 0.99574 0.08367 11.90 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3232 on 48 degrees of freedom
## Multiple R-squared: 0.7469, Adjusted R-squared: 0.7416
## F-statistic: 141.6 on 1 and 48 DF, p-value: 6.298e-16
# p-value < 0.05: significant at the 95% confidence level, so the null
# hypothesis (no relationship) is rejected.
# R-squared: the closer the coefficient of determination is to 1, the
# better the model fits (~0.75 here).
names(testvi_lm)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# FIX: the predictor (Petal.Length) belongs on the x-axis to match the
# model formula Sepal.Length ~ Petal.Length — the original call had the
# axes swapped. Overlay the fitted regression line for a visual check.
plot(test_vi$Petal.Length, test_vi$Sepal.Length,
     xlab = "Petal.Length", ylab = "Sepal.Length")
abline(testvi_lm, col = "red")

# Decision tree: classify Species with rpart(), using every other
# column as a predictor (Species ~ .).
rpart_model <- rpart(formula = Species ~ ., data = iris)
rpart_model
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
# Draw the fitted tree (rpart.plot package)
rpart.plot(rpart_model)

# Scatter plots for every pair of the four measurement columns, colored
# by species. combn() enumerates the 6 unordered pairs in exactly the
# order the original spelled out by hand:
# (SL,SW), (SL,PL), (SL,PW), (SW,PL), (SW,PW), (PL,PW).
pair_matrix <- combn(names(iris)[1:4], 2)
for (k in seq_len(ncol(pair_matrix))) {
  x_col <- pair_matrix[1, k]
  y_col <- pair_matrix[2, k]
  plot(iris[[x_col]], iris[[y_col]],
       xlab = x_col, ylab = y_col,
       main = paste(x_col, "vs", y_col, "Scatter Plot"),
       col = iris$Species)
}

# Scatter plot of Sepal.Length vs Sepal.Width with ggplot2
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  labs(title = "Sepal.Length vs. Sepal.Width",
       x = "Sepal.Length",
       y = "Sepal.Width",
       color = "Species")

# Scatter plot of Petal.Length vs Petal.Width.
# FIX: the original mapped x = Sepal.Length and kept the title/labels
# from the previous plot ("Sepal.Length vs. Sepal.Width"), contradicting
# both its own comment and its y aesthetic; the aesthetics and labels
# now agree with the stated intent.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point() +
  labs(title = "Petal.Length vs. Petal.Width",
       x = "Petal.Length",
       y = "Petal.Width",
       color = "Species")

# Scatter plot matrix of all four measurements, colored by species
pairs(iris[, 1:4], col = iris$Species, pch = 19)

# The column names are long, so shorten them
# (sl = Sepal.Length, sw = Sepal.Width, pl = Petal.Length,
#  pw = Petal.Width, s = Species).
# NOTE: this overwrites the column names of iris for the rest of the
# session; all code below depends on the short names.
names(iris) <- c("sl", "sw", "pl", "pw", "s")

# 80/20 train/test split, seeded for reproducibility
set.seed(42)
trainIndex <- sample(seq_len(nrow(iris)), nrow(iris) * 0.8)
trainSet <- iris[trainIndex, ]
testSet <- iris[-trainIndex, ]

# Fit a random forest and predict the held-out rows.
# FIX: type = "class" is an argument of predict.randomForest(), not of
# randomForest(); passing it to the fit was silently ignored via `...`.
model_rf <- randomForest(s ~ ., data = trainSet)
rf.pred <- predict(model_rf, testSet)
# Accuracy = fraction of test rows whose predicted species matches
rf.accuracy <- sum(rf.pred == testSet$s) / length(testSet$s)
print(paste("Random Forest Accuracy: ", rf.accuracy))
## [1] "Random Forest Accuracy: 0.933333333333333"
# Convert Species to a 0-based numeric class label (xgboost requires
# numeric labels) and split the train/test frames into feature matrices
# (columns 1-4) and label vectors (column 5).
trainSet$s <- as.numeric(trainSet$s) - 1
testSet$s <- as.numeric(testSet$s) - 1
trainSet_x <- as.matrix(trainSet[, -5])
trainSet_y <- trainSet[, 5]
testSet_x <- as.matrix(testSet[, -5])
testSet_y <- testSet[, 5]
# xgboost parameters: multiclass soft-probability objective over the
# 3 species, evaluated with multiclass log-loss
params <- list("objective" = "multi:softprob",
"eval_metric" = "mlogloss",
"num_class" = 3)
# Train for 100 boosting rounds, then predict on the test matrix
xgb.model <- xgboost(data = trainSet_x, label = trainSet_y, params = params, nrounds = 100)
## [1] train-mlogloss:0.734324
## [2] train-mlogloss:0.522261
## [3] train-mlogloss:0.385193
## [4] train-mlogloss:0.291406
## [5] train-mlogloss:0.225320
## [6] train-mlogloss:0.177849
## [7] train-mlogloss:0.143049
## [8] train-mlogloss:0.117298
## [9] train-mlogloss:0.098051
## [10] train-mlogloss:0.082183
## [11] train-mlogloss:0.069841
## [12] train-mlogloss:0.060162
## [13] train-mlogloss:0.051834
## [14] train-mlogloss:0.046324
## [15] train-mlogloss:0.041908
## [16] train-mlogloss:0.038578
## [17] train-mlogloss:0.035833
## [18] train-mlogloss:0.033422
## [19] train-mlogloss:0.031589
## [20] train-mlogloss:0.030162
## [21] train-mlogloss:0.029318
## [22] train-mlogloss:0.028246
## [23] train-mlogloss:0.027818
## [24] train-mlogloss:0.027204
## [25] train-mlogloss:0.026556
## [26] train-mlogloss:0.026178
## [27] train-mlogloss:0.025589
## [28] train-mlogloss:0.025013
## [29] train-mlogloss:0.024462
## [30] train-mlogloss:0.023902
## [31] train-mlogloss:0.023641
## [32] train-mlogloss:0.023310
## [33] train-mlogloss:0.022994
## [34] train-mlogloss:0.022758
## [35] train-mlogloss:0.022280
## [36] train-mlogloss:0.022048
## [37] train-mlogloss:0.021611
## [38] train-mlogloss:0.021394
## [39] train-mlogloss:0.021126
## [40] train-mlogloss:0.020916
## [41] train-mlogloss:0.020662
## [42] train-mlogloss:0.020475
## [43] train-mlogloss:0.020233
## [44] train-mlogloss:0.020016
## [45] train-mlogloss:0.019841
## [46] train-mlogloss:0.019675
## [47] train-mlogloss:0.019498
## [48] train-mlogloss:0.019332
## [49] train-mlogloss:0.019175
## [50] train-mlogloss:0.019011
## [51] train-mlogloss:0.018858
## [52] train-mlogloss:0.018720
## [53] train-mlogloss:0.018567
## [54] train-mlogloss:0.018425
## [55] train-mlogloss:0.018289
## [56] train-mlogloss:0.018146
## [57] train-mlogloss:0.018019
## [58] train-mlogloss:0.017889
## [59] train-mlogloss:0.017760
## [60] train-mlogloss:0.017644
## [61] train-mlogloss:0.017516
## [62] train-mlogloss:0.017397
## [63] train-mlogloss:0.017284
## [64] train-mlogloss:0.017163
## [65] train-mlogloss:0.017054
## [66] train-mlogloss:0.016951
## [67] train-mlogloss:0.016845
## [68] train-mlogloss:0.016750
## [69] train-mlogloss:0.016654
## [70] train-mlogloss:0.016562
## [71] train-mlogloss:0.016475
## [72] train-mlogloss:0.016384
## [73] train-mlogloss:0.016302
## [74] train-mlogloss:0.016219
## [75] train-mlogloss:0.016138
## [76] train-mlogloss:0.016063
## [77] train-mlogloss:0.015985
## [78] train-mlogloss:0.015910
## [79] train-mlogloss:0.015839
## [80] train-mlogloss:0.015768
## [81] train-mlogloss:0.015697
## [82] train-mlogloss:0.015628
## [83] train-mlogloss:0.015559
## [84] train-mlogloss:0.015491
## [85] train-mlogloss:0.015423
## [86] train-mlogloss:0.015360
## [87] train-mlogloss:0.015304
## [88] train-mlogloss:0.015252
## [89] train-mlogloss:0.015203
## [90] train-mlogloss:0.015159
## [91] train-mlogloss:0.015087
## [92] train-mlogloss:0.015044
## [93] train-mlogloss:0.015004
## [94] train-mlogloss:0.014936
## [95] train-mlogloss:0.014895
## [96] train-mlogloss:0.014858
## [97] train-mlogloss:0.014826
## [98] train-mlogloss:0.014795
## [99] train-mlogloss:0.014765
## [100] train-mlogloss:0.014695
xgb.pred <- predict(xgb.model, testSet_x)
# multi:softprob returns one flat vector of per-class probabilities;
# reshape to one row per observation, 3 columns (one per class)
xgb.pred <- matrix(xgb.pred, ncol = 3, byrow = TRUE)
# predicted class = index of the highest-probability column, made 0-based
# to match the labels above
xgb.pred.labels <- max.col(xgb.pred) - 1
xgb.accuracy <- sum(testSet_y == xgb.pred.labels) / length(testSet_y)
print(paste("XGBoost Accuracy: ", xgb.accuracy))
## [1] "XGBoost Accuracy: 0.966666666666667"
# Confusion matrix (caret) for the xgboost predictions; both arguments
# must be factors with the 0/1/2 class labels
cm <- confusionMatrix(as.factor(xgb.pred.labels), as.factor(testSet_y))
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 9 0 0
## 1 0 10 0
## 2 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3667
## P-Value [Acc > NIR] : 4.476e-12
##
## Kappa : 0.9499
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 1.0 0.9091 1.0000
## Specificity 1.0 1.0000 0.9500
## Pos Pred Value 1.0 1.0000 0.9091
## Neg Pred Value 1.0 0.9500 1.0000
## Prevalence 0.3 0.3667 0.3333
## Detection Rate 0.3 0.3333 0.3333
## Detection Prevalence 0.3 0.3333 0.3667
## Balanced Accuracy 1.0 0.9545 0.9750
# Plot xgboost per-feature importance (gain)
importance_matrix <- xgb.importance(model = xgb.model)
xgb.plot.importance(importance_matrix)

# Compare the two models' accuracy (absolute difference)
print(paste("정확도 차이(랜덤포레스트-xgboost): ", abs(rf.accuracy - xgb.accuracy)))
## [1] "정확도 차이(랜덤포레스트-xgboost): 0.0333333333333333"